library(readr)
library(plyr)
library(dplyr)

## this script creates weights to sample users
## after this sampling, weights are recalculated for the analyses

## Path configuration. All values are empty-string placeholders in this copy
## of the script and must be filled in before running.
## data_dir and in_path are combined with file names via paste0(), so they
## need to end with a path separator.
data_dir <- ""
matched_file <- ""
raw_file <- ""
raw_file_ca <- ""
in_path <- ""

## Paths of every entry in data_dir whose name contains "tsv".
## paste0() recycles a zero-length argument to "", so pasting data_dir onto
## an empty match set would yield the bare data_dir string rather than an
## empty vector; the empty case is therefore handled explicitly.
## NOTE(review): thefiles is not referenced again in the visible script.
tsv_names <- grep("tsv", list.files(data_dir), value = TRUE)
thefiles <- if (length(tsv_names) > 0) {
    paste0(data_dir, tsv_names)
} else {
    character(0)
}


## Load the matched records with every column read as character, then keep
## only the profile ID and the demographic fields used further down.
## quote="" and comment="" switch off quote and comment handling in read_csv.
matched_data <- select(
    read_csv(
        matched_file,
        col_types = cols(.default = col_character()),
        quote = "",
        comment = "",
        trim_ws = TRUE
    ),
    twProfileID, first_name, ## last_name,
    birth_year, race, party_affiliation,
    gender, state
)

## Rows from the two raw files (raw_file, raw_file_ca) whose
## current_registration_status equals "Registered". Both files are read
## with identical settings (all columns as character) and stacked.
raw_data_registered <- lapply(
    c(raw_file, raw_file_ca),
    function(path) {
        read_csv(
            path,
            col_types = cols(.default = col_character()),
            quote = "",
            comment = "",
            trim_ws = TRUE
        )
    }
) %>%
    bind_rows() %>%
    select(twProfileID, current_registration_status) %>%
    filter(current_registration_status == "Registered")

## 1 when the profile ID appears among the registered rows, 0 otherwise
matched_data[["registered"]] <- as.integer(
    matched_data$twProfileID %in% raw_data_registered$twProfileID
)


## Reference list of first names, read from a fixed location.
## NOTE(review): the names are taken from column `x` -- presumably the
## default column name of a vector written with write.csv; confirm.
arabic_names_file <- file.path(
    "/net/data/socdist/twitter_hng/",
    "arabic_names.csv"
)
arabic_names <- read.csv(arabic_names_file)

## 1 when the user's first_name is an exact match in the list, 0 otherwise
matched_data[["arabic_name"]] <- as.integer(
    is.element(matched_data$first_name, arabic_names$x)
)

## Previously written sample file; only its twProfileID column is used here.
arabic_sample_file <- paste0(in_path, "arabic_voters_with_approx_weights.csv")
arabic_sample <- read.csv(arabic_sample_file)

## 1 when the user's profile ID is part of that sample, 0 otherwise
matched_data[["arabic_sample"]] <- as.integer(
    is.element(matched_data$twProfileID, arabic_sample$twProfileID)
)

## last_names <- read.csv("~/data/app_c.csv") #census last name demographics
## last_names$name <- tolower(last_names$name)

## matched_data <- merge(
##     matched_data,
##     last_names,
##     by.x="last_name", by.y="name",
##     all.x=T
## )


## Aggregate the matched users into demographic cells
## (state x registered x gender x 5-year birth cohort) and attach to every
## user the weight of the cell it falls into.

## one row per user, so summing `count` within a cell gives the cell size
matched_data$count <- rep(1, nrow(matched_data))

## 5-year birth cohorts, left-closed: [1900,1905), ..., [1995,2000).
## Birth years outside [1900, 2000), or values that do not parse as
## numbers, end up in the NA cohort.
## The cohort is stored on matched_data itself, not only inside the
## aggregation pipeline, because it is one of the join keys below:
## left_join() fails when a `by` column is missing from the left table.
matched_data$birth_year_group <- cut(
    as.numeric(matched_data$birth_year),
    breaks = seq(1900, 2000, 5),
    right = FALSE
)

## columns that define a cell; also the join keys
cell_keys <- c("state", "registered", "gender", "birth_year_group")

agg_df <- matched_data %>%
    group_by(
        state,
        registered,
        gender,
        birth_year_group
    ) %>%
    dplyr::summarise(
        sum_arabic_name = sum(arabic_name),
        sum_arabic_sample = sum(arabic_sample),
        sum_on_twitter = sum(count)
    ) %>%
    ## drop the grouping summarise() leaves behind
    ungroup()

## share of the cell's users that belong to the arabic sample
agg_df$weight <- with(
    agg_df,
    sum_arabic_sample / sum_on_twitter
)

## is.finite() is FALSE for NA, NaN and +/-Inf, so it alone removes every
## row without a usable weight
matched_data_with_weights <- left_join(
    matched_data,
    agg_df,
    by = cell_keys
) %>% filter(is.finite(weight))

## Build approx_weight on a working vector, starting from the cell weight.
approx_w <- matched_data_with_weights$weight
## matched_data_with_weights$approx_weight <- with(
##     ## try to sample fewer minority groups (did not appear to work very well, since only rare names available)
##     matched_data_with_weights, weight * as.numeric(as.character(pctwhite)) / 100
## )

## missing weights and unregistered users get weight 0
approx_w[is.na(approx_w)] <- 0
approx_w[matched_data_with_weights$registered == 0] <- 0

## Rescale the rows with arabic_name == 0 by a constant so that their
## weights sum to the total over all rows (taken before the arabic_name == 1
## rows are zeroed), then zero the arabic_name == 1 rows.
## If the arabic_name == 0 weights sum to 0 the rescaled values are NaN.
no_arabic_name <- matched_data_with_weights$arabic_name == 0
approx_w[no_arabic_name] <- approx_w[no_arabic_name] * sum(approx_w) / sum(approx_w[no_arabic_name])
approx_w[matched_data_with_weights$arabic_name == 1] <- 0

matched_data_with_weights$approx_weight <- approx_w

## Export the weighted table without the raw cell weight and without
## first names; row names are not written.
comparison_weights <- select(matched_data_with_weights, -weight, -first_name)
write.csv(
    comparison_weights,
    file = paste0(in_path, "arabic_voters_comparison_weights.csv"),
    row.names = FALSE
)
